{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A到均值的欧氏距离为: [[1.41421356]]\n",
      "B到均值的欧氏距离为: [[1.41421356]]\n",
      "A到总体均值的马氏距离为： [1.05263158]\n",
      "B到总体均值的马氏距离为： [0.]\n"
     ]
    }
   ],
   "source": [
    "#11.1\n",
    "\n",
    "'''\n",
    "如果使用scipy模块计算题中的马氏距离，会报错表示数据维度过小\n",
    "所以只能使用python表达数学公式的形式\n",
    "'''\n",
    "'''\n",
    "最终发现两个样本点的欧氏距离一致，但是马氏距离不一致\n",
    "原因在于欧式距离不考虑方差的问题，马氏距离考虑将方差引入欧式距离，所以马氏距离是欧式距离的一种改进\n",
    "'''\n",
    "import numpy as np\n",
    "import scipy.spatial.distance as ssd\n",
    "import scipy\n",
    "u = np.array([0,0]).reshape(1,-1)\n",
    "A = np.array([1,1]).reshape(1,-1)\n",
    "B = np.array([1,-1]).reshape(1,-1)\n",
    "print('A到均值的欧氏距离为:',ssd.cdist(u,A,'euclidean'))\n",
    "print('B到均值的欧氏距离为:',ssd.cdist(u,B,'euclidean'))\n",
    "#下面求马氏距离\n",
    "def mahalanobis(x=None, data=None, cov=None):\n",
    "    \"\"\"Compute the Mahalanobis Distance between each row of x and the data\n",
    "    x    : vector or matrix of data with, say, p columns.\n",
    "    data : ndarray of the distribution from which Mahalanobis distance of each observation of x is to be computed.\n",
    "    cov  : covariance matrix (p x p) of the distribution. If None, will be computed from data.\n",
    "    \"\"\"\n",
    "    x_minus_mu = x - np.mean(data)\n",
    "    if cov is None:\n",
    "        cov = np.cov(data.values.T)\n",
    "    inv_covmat = scipy.linalg.inv(cov)\n",
    "    left_term = np.dot(x_minus_mu, inv_covmat)\n",
    "    mahal = np.dot(left_term, x_minus_mu.T)\n",
    "    return mahal.diagonal()\n",
    "u = np.array([0,0]).reshape(1,-1)\n",
    "A = np.array([1,1]).reshape(1,-1)\n",
    "cov1= np.array([[1,0.9],[0.9,1]])\n",
    "print(\"A到总体均值的马氏距离为：\",mahalanobis(u,A,cov = cov1))\n",
    "print(\"B到总体均值的马氏距离为：\",mahalanobis(u,B,cov = cov1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "G1类x1、x2均值分别为： 3.0 4.333333333333333\n",
      "G2类x1、x2均值分别为： 5.0 8.0\n",
      "样本协方差矩阵s1、s2值分别为： [[ 0.5 -1.  -1.5]\n",
      " [-1.   2.   3. ]\n",
      " [-1.5  3.   4.5]] \n",
      "*** [[4.5 3.  6. ]\n",
      " [3.  2.  4. ]\n",
      " [6.  4.  8. ]]\n",
      "马氏距离分类结果: [1.]\n"
     ]
    }
   ],
   "source": [
    "#11.2\n",
    "\n",
    "import numpy as np\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "x0 = np.array([[3,2],[2,4],[4,7],[6,9],[5,7],[4,8]])\n",
    "xpan = np.array([2,7]).reshape(1,-1)\n",
    "g = np.hstack([np.ones(3),2*np.ones(3)])#g为已知样本数据的类别标号\n",
    "print('G1类x1、x2均值分别为：',x0[:3,:1].mean(),x0[:3,1:2].mean())\n",
    "print('G2类x1、x2均值分别为：',x0[3:,:1].mean(),x0[3:,1:2].mean())\n",
    "print('样本协方差矩阵s1、s2值分别为：',np.cov(x0[:3]),'\\n***',np.cov(x0[3:]))\n",
    "'''\n",
    "根据11.1的顺延，应用马氏距离进行数据点的判别\n",
    "'''\n",
    "v = np.cov(x0.T)\n",
    "knn = KNeighborsClassifier(2,metric='mahalanobis',metric_params={'V':v})#马氏距离分类\n",
    "knn.fit(x0,g)\n",
    "pre = knn.predict(xpan)\n",
    "print('马氏距离分类结果:',pre)#1表示G1类别"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "分类结果： [2 2 1]\n",
      "回代误判率： 0.125\n"
     ]
    }
   ],
   "source": [
    "#11.3\n",
    "\n",
    "from scipy.stats import zscore\n",
    "a=np.loadtxt('ti11_3.txt')\n",
    "m=a.shape[0]; b=zscore(a); x=b[8:,:]\n",
    "x0=b[:8,:]\n",
    "y0=np.hstack([np.ones(3),2*np.ones(5)])\n",
    "\n",
    "x1=b[:3,:]; mu1=x1.mean(axis=0)\n",
    "x2=b[3:8,:]; mu2=x2.mean(axis=0); D=[]\n",
    "for i in x:\n",
    "    d1 = np.linalg.norm(i-mu1)\n",
    "    d2 = np.linalg.norm(i-mu2)\n",
    "    D.append([d1, d2])\n",
    "ind1 = np.argmin(D, axis=1)+1\n",
    "print('分类结果：', ind1)\n",
    "check =[]  #存放已知样本点的马氏距离\n",
    "for i in x0:\n",
    "    d1 = np.linalg.norm(i-mu1)\n",
    "    d2 = np.linalg.norm(i-mu2)\n",
    "    check.append([d1, d2])\n",
    "ind2 = np.argmin(check, axis=1)+1\n",
    "rate = sum(y0-ind2)/len(y0) #计算误判率\n",
    "print('回代误判率：', rate)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "特征值为： [3.75512808 2.196736   1.21489177 0.40239979 0.21280241 0.1379589\n",
      " 0.06545781 0.01462524]\n",
      "特征向量为：\n",
      " [[-0.45664998 -0.25882822  0.10966883 -0.32029723 -0.76012186 -0.14896175\n",
      "   0.09454094 -0.03079514]\n",
      " [-0.31309075  0.40360072  0.24615967  0.6439629  -0.10892637 -0.16475097\n",
      "   0.21595855 -0.42208045]\n",
      " [-0.47047522 -0.108706    0.19232226 -0.42633456  0.60825947 -0.1670454\n",
      "   0.31715673 -0.2227441 ]\n",
      " [-0.24059097  0.48740846  0.33381048 -0.25793437 -0.03265346  0.65937764\n",
      "  -0.29796515  0.0442838 ]\n",
      " [ 0.26244047 -0.16996271  0.72275514  0.11307699 -0.0201658   0.05888927\n",
      "   0.41882018  0.43334896]\n",
      " [-0.42456385 -0.28790815  0.19141304  0.32931832  0.19037571 -0.20327557\n",
      "  -0.61456767  0.36946939]\n",
      " [ 0.31973391 -0.40102783  0.39697583 -0.0420793  -0.01349131  0.07657058\n",
      "  -0.36081813 -0.66434301]\n",
      " [-0.25070858 -0.49812515 -0.24971406  0.33380016  0.04972306  0.66131122\n",
      "   0.26787833 -0.06060681]]\n",
      "各主成分的贡献率为： [0.46939101 0.274592   0.15186147 0.05029997 0.0266003  0.01724486\n",
      " 0.00818223 0.00182816]\n"
     ]
    }
   ],
   "source": [
    "#11.4\n",
    "\n",
    "import numpy as np\n",
    "from scipy.stats import zscore\n",
    "import pandas as pd\n",
    "a = np.loadtxt('ti12_3_1.txt')\n",
    "b = zscore(a, ddof=1)          #数据标准化\n",
    "r = np.corrcoef(b.T)   #求相关系数矩阵\n",
    "c, d = np.linalg.eig(r)  #求特征值和特征向量\n",
    "ind = np.argsort(-c)  #特征值从大到小排序的地址\n",
    "cc = c[ind]; dd = d[ind,:]  #重排特征值和特征向量的顺序\n",
    "print('特征值为：', cc)\n",
    "print('特征向量为：\\n', dd)\n",
    "rt = cc/sum(cc)  #计算各主成分的贡献率\n",
    "cr = np.cumsum(rt)  #求累积贡献率\n",
    "print('各主成分的贡献率为：', rt)#可见第一项指标比较重要"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "特征值为： [3.35097027 1.51599708 0.38660056 0.3542329  0.24757787 0.14462131]\n",
      "特征向量为：\n",
      " [[ 0.44696175  0.38113174  0.72002027  0.35171682 -0.08202847 -0.07811029]\n",
      " [ 0.40773446  0.40522194 -0.41138775 -0.31042318 -0.57017021 -0.28080469]\n",
      " [ 0.41013317 -0.35396661  0.16294645 -0.36682381 -0.2228788   0.70406832]\n",
      " [ 0.35549178 -0.53463092 -0.29884206  0.66253644 -0.20810023 -0.12739133]\n",
      " [ 0.41835961  0.36743384 -0.41774048  0.14172746  0.6312124   0.3113595 ]\n",
      " [ 0.40542498 -0.38000468  0.1482083  -0.43166168  0.42043727 -0.5533191 ]]\n",
      "各主成分的贡献率为： [0.55849505 0.25266618 0.06443343 0.05903882 0.04126298 0.02410355]\n"
     ]
    }
   ],
   "source": [
    "#11.5\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "r = np.array([[1, 0.80, 0.37, 0.78, 0.26, 0.38],\n",
    "              [0.80, 1, 0.32, 0.65, 0.18, 0.33],\n",
    "              [0.37, 0.32, 1, 0.36, 0.71, 0.62],\n",
    "              [0.78, 0.65, 0.36, 1, 0.18, 0.39],\n",
    "              [0.26, 0.18, 0.71, 0.18, 1, 0.69],\n",
    "              [0.38, 0.33, 0.62, 0.39, 0.69, 1]])\n",
    "c, d = np.linalg.eig(r)  #求特征值和特征向量\n",
    "ind = np.argsort(-c)     #特征值从大到小排序的地址\n",
    "cc = c[ind]; dd = d[ind,:]  #重排特征值和特征向量的顺序\n",
    "print('特征值为：', cc); print('特征向量为：\\n', dd)\n",
    "rt = cc/sum(cc)     #计算各主成分的贡献率\n",
    "cr = np.cumsum(rt)  #求累积贡献率\n",
    "print('各主成分的贡献率为：', rt) #身高比较重要"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "特征值为： [3.35097027 1.51599708 0.38660056 0.3542329  0.24757787 0.14462131]\n",
      "特征向量为：\n",
      " [[ 0.44696175  0.38113174  0.72002027  0.35171682 -0.08202847 -0.07811029]\n",
      " [ 0.40773446  0.40522194 -0.41138775 -0.31042318 -0.57017021 -0.28080469]\n",
      " [ 0.41013317 -0.35396661  0.16294645 -0.36682381 -0.2228788   0.70406832]\n",
      " [ 0.35549178 -0.53463092 -0.29884206  0.66253644 -0.20810023 -0.12739133]\n",
      " [ 0.41835961  0.36743384 -0.41774048  0.14172746  0.6312124   0.3113595 ]\n",
      " [ 0.40542498 -0.38000468  0.1482083  -0.43166168  0.42043727 -0.5533191 ]]\n",
      "[0.55849505 0.81116123 0.87559465 0.93463347 0.97589645 1.        ]\n",
      "各主成分的贡献率为： [0.55849505 0.25266618 0.06443343 0.05903882 0.04126298 0.02410355]\n",
      "特征值: [3.35097027 1.51599708 0.14462131 0.24757787 0.3542329  0.38660056] \n",
      "载荷矩阵：\n",
      " [[ 0.81819278  0.46927163  0.27381729  0.17500443 -0.04882127 -0.0485668 ]\n",
      " [ 0.74638467  0.4989329  -0.1564471  -0.15445788 -0.33935087 -0.17459652]\n",
      " [ 0.742157   -0.46788393  0.05636229 -0.21478275  0.25023362 -0.34403838]\n",
      " [ 0.7658347   0.45240598 -0.15886298  0.07051961  0.37568164  0.19359465]\n",
      " [ 0.65075101 -0.65826878 -0.11364697  0.32965956 -0.12385599 -0.07920838]\n",
      " [ 0.75077566 -0.43582434  0.06196708 -0.18252125 -0.13265182  0.43777005]] \n",
      "----------\n",
      "对x的贡献为： [3.35097027 1.51599708 0.14462131]\n",
      "共同度为： [0.9646312  0.8304998  0.77288909 0.8164114  0.8697103  0.75744687]\n"
     ]
    }
   ],
   "source": [
    "#11.6\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "r = np.array([[1, 0.80, 0.37, 0.78, 0.26, 0.38],\n",
    "              [0.80, 1, 0.32, 0.65, 0.18, 0.33],\n",
    "              [0.37, 0.32, 1, 0.36, 0.71, 0.62],\n",
    "              [0.78, 0.65, 0.36, 1, 0.18, 0.39],\n",
    "              [0.26, 0.18, 0.71, 0.18, 1, 0.69],\n",
    "              [0.38, 0.33, 0.62, 0.39, 0.69, 1]])\n",
    "c, d = np.linalg.eig(r)  #求特征值和特征向量\n",
    "ind = np.argsort(-c)     #特征值从大到小排序的地址（即返回每个特征值排序的序号）\n",
    "cc = c[ind]; dd = d[ind,:]  #重排特征值和特征向量的顺序\n",
    "print('特征值为：', cc); print('特征向量为：\\n', dd)\n",
    "rt = cc/sum(cc)     #计算各主成分的贡献率\n",
    "cr = np.cumsum(rt)  #求累积贡献率\n",
    "print(cr)\n",
    "print('各主成分的贡献率为：', rt)\n",
    "'''\n",
    "可以看到前三个特征值的累积贡献率为0.87559，超过85%，所以公共因子的个数为3比较合适\n",
    "因子分析表达式即为求出的因子载荷矩阵（（特征值取根号）*特征向量）所构成的表达式\n",
    "身高，体重，胸围对于制定服装标准具有重要影响\n",
    "'''\n",
    "A0=d*np.sqrt(c)       #利用矩阵广播求载荷矩阵\n",
    "print('特征值:',c,'\\n载荷矩阵：\\n',A0,'\\n----------')\n",
    "num=3#公共因子的个数\n",
    "A=A0[:,:num]              #提出num个因子的载荷矩阵\n",
    "Ac=np.sum(A**2, axis=0)   #逐列元素求和，求信息贡献\n",
    "Ar=np.sum(A**2, axis=1)   #逐行元素求和，求共同度\n",
    "print(\"对x的贡献为：\",Ac)\n",
    "print(\"共同度为：\",Ar)\n",
    "\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}